Skip to content

Conversation

@jianyun8023
Copy link
Contributor

[Fix] #698 修复使用Redis,Request丢失附加信息问题

@jianyun8023
Copy link
Contributor Author

CI 报 JAVA_HOME 找不到,可以考虑使用 openjdk7。
我今天看你使用的ci,也去使用了它,参考你使用的oraclejdk7,也报同样的问题,换成openjdk7就好了
.travis.yml

@code4craft code4craft merged commit b0bf1a9 into code4craft:master Nov 30, 2017
@code4craft
Copy link
Owner

已合!另外,方法名不要大写,CheckForAdditionalInfo这个也改一下?

@jianyun8023
Copy link
Contributor Author

尴尬了,手残了,尴尬,你直接修改吧

@code4craft
Copy link
Owner

改掉了,另外pool.returnResource(jedis);也被废弃了,我改成了新的jedis.close()

@jianyun8023
Copy link
Contributor Author

下面这个是基于 Spring Data Redis 弄的

另外我用你的爬虫+springboot+spring boot admin整合了一个简单的分布式爬虫监控

package cc.yihy.spider; import com.alibaba.fastjson.JSON; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.redis.core.HashOperations; import org.springframework.data.redis.core.ListOperations; import org.springframework.data.redis.core.RedisTemplate; import org.springframework.data.redis.core.SetOperations; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler; import us.codecraft.webmagic.scheduler.MonitorableScheduler; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import javax.annotation.Resource; @Component public class RedisDataScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover { private static final String QUEUE_PREFIX = "queue_"; private static final String SET_PREFIX = "set_"; private static final String ITEM_PREFIX = "item_"; private static final String HEADER_PREFIX = "header_"; // inject the actual template @Autowired private RedisTemplate<String, String> redisTemplate; // inject the template as ListOperations @Resource(name = "redisTemplate") private SetOperations<String, String> setOps; @Resource(name = "redisTemplate") private ListOperations<String, String> listOps; @Resource(name = "redisTemplate") private HashOperations<String, String, String> hashOps; public RedisDataScheduler() { setDuplicateRemover(this); } /**  * 添加集合元素  *  * @param request  * @param task  * @return  */ @Override public boolean isDuplicate(Request request, Task task) { return setOps.add(getSetKey(task), request.getUrl()).intValue() == 0; } @Override protected void pushWhenNoDuplicate(Request request, Task task) { listOps.rightPush(getQueueKey(task), request.getUrl()); if (checkForAdditionalInfo(request)) { String field = 
DigestUtils.shaHex(request.getUrl()); String value = JSON.toJSONString(request); hashOps.put(getItemKey(task), field, value); } } private boolean checkForAdditionalInfo(Request request) { if (request == null) { return false; } if (!request.getHeaders().isEmpty() || !request.getCookies().isEmpty()) { return true; } if (StringUtils.isNotBlank(request.getCharset()) || StringUtils.isNotBlank(request.getMethod())) { return true; } if (request.isBinaryContent() || request.getRequestBody() != null) { return true; } if (request.getExtras() != null && !request.getExtras().isEmpty()) { return true; } if (request.getPriority() != 0L) { return true; } return false; } /**  * 删除一个  *  * @param task  */ @Override public void resetDuplicateCheck(Task task) { setOps.remove(getSetKey(task)); } /**  * 获取一个url,  *  * @param task  * @return  */ @Override public Request poll(Task task) { String url = listOps.leftPop(getQueueKey(task)); if (url == null) { return null; } String key = getItemKey(task); String field = DigestUtils.shaHex(url); String value = hashOps.get(key, field); if (value != null) { Request o = JSON.parseObject(value, Request.class); return o; } Request request = new Request(url); return request; } @Override public int getTotalRequestsCount(Task task) { Long size = setOps.size(getSetKey(task)); return size.intValue(); } /**  * 获取队列中的请求数量  *  * @param task  * @return  */ @Override public int getLeftRequestsCount(Task task) { Long size = listOps.size(getQueueKey(task)); return size.intValue(); } protected String getSetKey(Task task) { return SET_PREFIX + task.getUUID(); } protected String getQueueKey(Task task) { return QUEUE_PREFIX + task.getUUID(); } protected String getItemKey(Task task) { return ITEM_PREFIX + task.getUUID(); } } 
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

3 participants